In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling: white background, longer y-axis major ticks, and
# "poster"-sized elements with fonts scaled up by 10%.
# NOTE(review): newer seaborn releases appear to treat tick sizes as context
# (not style) parameters, so the 'ytick.major.size' override may be ignored
# there — confirm against the installed seaborn version.
sns.set_style(style="white", rc={'ytick.major.size': 10.0})
sns.set_context(context="poster", font_scale=1.1)
In [3]:
# Read the three raw CSV files (train users, test users, session logs)
# from the data directory into DataFrames.
path = '../data/'
train_users, test_users, sessions = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train_users.csv', 'test_users.csv', 'sessions.csv')
)
In [4]:
# Stack train and test users row-wise into one frame with a fresh 0..n-1 index
users = pd.concat([train_users, test_users], ignore_index=True)
In [5]:
# Compare the number of users with the number of distinct user IDs seen in the
# session log. nunique() skips missing values, whereas len(unique()) counts NaN
# as an extra "ID" whenever any session row has no user_id.
n_session_ids = sessions['user_id'].nunique()
print("We have", len(users), "users and", n_session_ids, "ID's in the session set.")
In [6]:
# Show the (rows, columns) dimensions of the sessions table
print(f"{sessions.shape}")
In [7]:
# Number of missing values in each column of the sessions table
sessions.isna().sum()
Out[7]:
In [8]:
# Peek at the first session rows that have no user_id attached
sessions[sessions['user_id'].isna()].head()
Out[8]:
In [9]:
# Distinct values found in the action_type column
sessions['action_type'].unique()
Out[9]:
In [10]:
# Treat the '-unknown-' placeholder in action_type as a missing value
sessions['action_type'] = sessions['action_type'].replace({'-unknown-': np.nan})
In [11]:
# Uncomment to list every distinct value of the action column:
# list(sessions.action.unique())
In [12]:
# Most frequent values of the action column (first rows of the frequency table)
sessions['action'].value_counts().head()
Out[12]:
In [13]:
# Frequency of each action_type value
sessions['action_type'].value_counts()
Out[13]:
In [14]:
# All session rows whose action_type is 'booking_response'
sessions[sessions['action_type'] == 'booking_response']
Out[14]:
In [15]:
# Frequency of each device_type value
sessions['device_type'].value_counts()
Out[15]:
In [16]:
# Index both tables by user id so rows can be selected by id label.
# Reassigning (instead of inplace=True) behind an index-name check keeps this
# cell safe to re-run: a second set_index('id') would raise KeyError because
# the column has already been moved into the index.
if train_users.index.name != 'id':
    train_users = train_users.set_index('id')
if sessions.index.name != 'user_id':
    sessions = sessions.set_index('user_id')
In [17]:
# Training users whose country_destination is anything other than 'NDF'
users_with_destination = train_users.loc[train_users['country_destination'] != 'NDF']
# Ids of those users (train_users is indexed by id at this point)
a = users_with_destination.index.values
# Distinct user ids present in the sessions index
b = sessions.index.unique()
# Ids found in both sets. Sorted so the list order (and therefore the row order
# of any selection built from it) is reproducible; plain list(set(...)) order
# depends on hashing and can change between kernel restarts.
# NOTE(review): assumes the ids are mutually comparable (e.g. all strings) — confirm.
sessions_id = sorted(set(a).intersection(b))
In [18]:
# Session rows (all columns) belonging to the ids collected in sessions_id
users_with_destination_sessions = sessions.loc[sessions_id, :]
In [19]:
# Frequency of each action among the sessions of users with a destination
users_with_destination_sessions['action'].value_counts()
Out[19]:
In [ ]:
# When there is a booking, what is the most probable action?
In [ ]:
# Elapsed Seconds